import pandas as pd
# Load the combined retail dataset; expected columns (see preview below):
# customer_id, product_id, purchase_score, name, description.
df= pd.read_csv('retail_combined_data.csv')
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Concatenate, Dropout
from tensorflow.keras.optimizers import Adam
from keras.layers import Input, Dropout, Dense, BatchNormalization, Activation, concatenate, GRU, Embedding, Flatten, BatchNormalization
# Re-encode the raw customer and product identifiers as contiguous integer
# indices (0..n-1) so they can be fed to Keras Embedding layers below.
customer_encoder = LabelEncoder()
product_encoder = LabelEncoder()
for column, encoder in (("customer_id", customer_encoder),
                        ("product_id", product_encoder)):
    df[column] = encoder.fit_transform(df[column])
df
customer_id product_id purchase_score name description
0 269 2 5 Headphones Noise-canceling wireless headphones
1 53 4 2 Smartwatch Smartwatch with health tracking features
2 412 3 4 Camera DSLR camera with 4K video recording
3 130 0 5 Laptop High-performance laptop with latest processor
4 609 1 4 Smartphone Feature-rich smartphone with excellent camera
... ... ... ... ... ...
19995 52 3 2 Camera DSLR camera with 4K video recording
19996 788 2 4 Headphones Noise-canceling wireless headphones
19997 396 1 2 Smartphone Feature-rich smartphone with excellent camera
19998 193 3 3 Camera DSLR camera with 4K video recording
19999 398 3 1 Camera DSLR camera with 4K video recording

20000 rows × 5 columns

# Sanity check: after encoding, product ids form a small contiguous set (0..4).
df['product_id'].unique()
array([2, 4, 3, 0, 1], dtype=int64)
# Hold out 20% for validation. The fixed seed matters: the padded text arrays
# are later split with the same random_state so rows stay label-aligned.
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

Collaborative filtering

# ---- Collaborative-filtering branch -------------------------------------
# Learns one latent vector per customer and per product, then regresses the
# purchase_score from the concatenated pair.
num_customers = df["customer_id"].nunique() + 1
num_products = df["product_id"].nunique() + 1
embedding_dim = 10

# One integer id per example.
customer_input = Input(shape=(1,), name="customer_input")
product_input = Input(shape=(1,), name="product_input")

# Embedding lookups (created in this order, then flattened to 1-D vectors).
customer_latent = Embedding(num_customers, embedding_dim, name="customer_embedding")(customer_input)
product_latent = Embedding(num_products, embedding_dim, name="product_embedding")(product_input)
customer_flat = Flatten()(customer_latent)
product_flat = Flatten()(product_latent)

# Regression head: concat -> 128 -> 64 -> linear score.
hidden = Concatenate()([customer_flat, product_flat])
for units in (128, 64):
    hidden = Dense(units, activation='relu')(hidden)
collab_output = Dense(1, activation='linear', name="collab_output")(hidden)

collab_model = Model(inputs=[customer_input, product_input], outputs=collab_output)
collab_model.compile(optimizer='adam', loss='mse', metrics=['mae'])
collab_model.fit(
    [train_data['customer_id'], train_data['product_id']], train_data['purchase_score'],
    validation_data=([test_data['customer_id'], test_data['product_id']], test_data['purchase_score']),
    epochs=1, batch_size=200
)
80/80 ━━━━━━━━━━━━━━━━━━━━ 5s 13ms/step - loss: 7.4246 - mae: 2.3178 - val_loss: 2.0279 - val_mae: 1.2307
<keras.src.callbacks.history.History at 0x2154008d110>

Content filtering

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
# Fit ONE shared vocabulary over both product names and descriptions so the
# two embedding layers of the content model share token ids.
raw_text = np.hstack([df.name.str.lower(), df.description.str.lower()])
tokenizer = Tokenizer()
tokenizer.fit_on_texts(raw_text)
# Token-id sequences for every dataset row (one list of ints per row).
description_token= tokenizer.texts_to_sequences(df.description.str.lower())
name_token = tokenizer.texts_to_sequences(df.name.str.lower())
#description_token_test= tokenizer.texts_to_sequences(test_data.description.str.lower())
#name_token_test = tokenizer.texts_to_sequences(test_data.name.str.lower())
# Longest observed sequence lengths — computed but NOT used below; see note.
max_description_token_length = pd.Series(description_token).map(len).max()
max_name_token_length = pd.Series(name_token).map(len).max()
# +1 because Keras Tokenizer ids start at 1 and Embedding needs input_dim > max id.
vocab_size = np.max(np.concatenate([np.concatenate(description_token),np.concatenate(name_token)]))+1
from tensorflow.keras.preprocessing.sequence import pad_sequences
# NOTE(review): maxlen is hard-coded (5 and 2) instead of the computed
# max_*_token_length above; descriptions longer than 5 tokens are silently
# truncated (pad_sequences drops leading tokens by default). The fixed
# Input shapes of the content model depend on these values — confirm intended.
desc_padded = pad_sequences(description_token, maxlen=5)
name_padded = pad_sequences(name_token, maxlen=2)
# ---- Content-based branch ------------------------------------------------
# Embeds the padded description (len 5) and name (len 2) token sequences,
# encodes each with a GRU, and regresses purchase_score from the pair.
from keras.layers import Input, Dropout, Dense, BatchNormalization, Activation, concatenate, GRU, Embedding, Flatten, BatchNormalization

# Sequence inputs sized to match the pad_sequences maxlen values above.
desc_input = Input(shape=(5,), name="desc_input")
name_input = Input(shape=(2,), name="name_input")

# Token embeddings over the shared name+description vocabulary.
desc_embedded = Embedding(input_dim=vocab_size, output_dim=5)(desc_input)
name_embedded = Embedding(input_dim=vocab_size, output_dim=5)(name_input)

# Sequence encoders: a wider GRU for the longer description text.
desc_encoded = GRU(16)(desc_embedded)
name_encoded = GRU(8)(name_embedded)

# Regression head: concat -> 64 -> 64 -> linear score.
hidden = Concatenate()([desc_encoded, name_encoded])
hidden = Dense(64, activation='relu')(hidden)
hidden = Dense(64, activation='relu')(hidden)
content_output = Dense(1, activation='linear', name="content_output")(hidden)

# Split the padded text arrays with the SAME seed as the df split above so
# each text row stays aligned with its train_data/test_data label row.
name_train, name_test, desc_train, desc_test = train_test_split(
    name_padded, desc_padded, test_size=0.2, random_state=42
)
print(name_train.shape)
print(desc_train.shape)
print(train_data['purchase_score'].shape)
(16000, 2)
(16000, 5)
(16000,)
# ---- Train the content-based model --------------------------------------
content_model = Model(inputs=[name_input,desc_input], outputs=content_output)
content_model.compile(optimizer='adam', loss='mse', metrics=['mae'])
# Materialize labels as plain arrays instead of writing an np.array back
# into the train_data slice (the old code mutated the split in place and
# risked pandas' SettingWithCopy chained-assignment behavior).
train_labels = train_data['purchase_score'].to_numpy()
test_labels = test_data['purchase_score'].to_numpy()
content_model.fit(
    [name_train, desc_train], train_labels,
    validation_data=([name_test, desc_test], test_labels),
    epochs=1, batch_size=2,
)
8000/8000 ━━━━━━━━━━━━━━━━━━━━ 63s 7ms/step - loss: 2.2415 - mae: 1.2799 - val_loss: 2.0064 - val_mae: 1.2101
<keras.src.callbacks.history.History at 0x21543fbdc90>

Hybrid filtering

# ---- Hybrid model --------------------------------------------------------
# Stacks a small regression head on top of the two branch outputs. Note this
# reuses (and further trains) the collaborative and content sub-graphs.
fused = Concatenate()([collab_output, content_output])
fused = Dense(64, activation='relu')(fused)
hybrid_output = Dense(1, activation='linear', name="hybrid_output")(fused)

hybrid_model = Model(
    inputs=[customer_input, product_input, name_input, desc_input],
    outputs=hybrid_output,
)
hybrid_model.compile(optimizer='adam', loss='mse', metrics=['mae'])

hybrid_model.fit(
    [train_data['customer_id'], train_data['product_id'], name_train, desc_train],
    train_data['purchase_score'],
    validation_data=(
        [test_data['customer_id'], test_data['product_id'], name_test, desc_test],
        test_data['purchase_score'],
    ),
    epochs=2, batch_size=20
)
Epoch 1/2
800/800 ━━━━━━━━━━━━━━━━━━━━ 25s 12ms/step - loss: 2.4399 - mae: 1.3189 - val_loss: 2.0067 - val_mae: 1.2160
Epoch 2/2
800/800 ━━━━━━━━━━━━━━━━━━━━ 8s 8ms/step - loss: 2.0284 - mae: 1.2332 - val_loss: 2.0411 - val_mae: 1.2343
<keras.src.callbacks.history.History at 0x2155275e590>
# Lookup tables over ids seen in the training split. NOTE(review): the ids in
# df are ALREADY LabelEncoder-encoded (0..n-1), so these maps mainly serve as
# a membership test for known ids; the enumerate() value does NOT correspond
# to the id space the models were trained on — confirm before using it.
customer_map = {customer_id: idx for idx, customer_id in enumerate(train_data['customer_id'].unique())}
product_map = {product_id: idx for idx, product_id in enumerate(train_data['product_id'].unique())}

def recommend_products(customer_id, top_n=3):
    """Recommend the ``top_n`` best-scoring products for a customer.

    Scores every known product with both sub-models and ranks products by
    the equal-weight average of the two predictions.

    Parameters
    ----------
    customer_id : int
        LabelEncoder-encoded customer id — the same id space the models
        were trained on.
    top_n : int, optional
        Number of product ids to return (default 3).

    Returns
    -------
    list[int]
        Encoded product ids, best first; or the string
        ``"Customer not found."`` for an unseen customer.
    """
    # Membership test only. The collaborative model was trained on the
    # encoded ids themselves, so customer_id is fed to the model directly
    # (the old code wrongly re-mapped it through enumerate order).
    if customer_id not in customer_map:
        return "Customer not found."

    # One representative df row per product: name/description are constant
    # within a product, so the first occurrence suffices. This keeps the
    # padded-text rows aligned with the product ids being scored (the old
    # code indexed name_padded/desc_padded with product *indices*, which
    # are unrelated to row positions). Fixes the undefined `all_products`
    # reference and the stray-indentation SyntaxError as well.
    first_row = {}
    for row, pid in enumerate(df['product_id'].to_numpy()):
        if pid not in first_row:
            first_row[pid] = row
    product_ids = np.array(sorted(first_row))
    rows = np.array([first_row[pid] for pid in product_ids])

    # Content scores from each product's (name, description) text.
    content_scores = content_model.predict(
        [name_padded[rows], desc_padded[rows]]).flatten()

    # Collaborative scores for this customer against every product.
    collab_scores = collab_model.predict(
        [np.full(len(product_ids), customer_id), product_ids]).flatten()

    # Hybrid score: weighted sum of collaborative & content scores.
    hybrid_scores = 0.5 * collab_scores + 0.5 * content_scores

    # Indices of the top_n highest hybrid scores, best first.
    top_indices = np.argsort(-hybrid_scores)[:top_n]
    return [int(product_ids[i]) for i in top_indices]
# Get recommendations
# (270 is an encoded customer id; expects a list of two encoded product ids.)
print(recommend_products(270, top_n=2))
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 83ms/step
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 88ms/step
[2, 4]